{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find ICD9 Codes for Hospital admission\n",
    "======================================"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalise_icd9_code(code) :\n",
    "    return code[:3] + '.' + code[3:]\n",
    "\n",
    "def combine_icd9_codes(groupframe) :\n",
    "    groupframe = groupframe.sort_values(by='SEQ_NUM')\n",
    "    icd9_codes = ';'.join([normalise_icd9_code(x) for x in list(groupframe.ICD9_CODE)])\n",
    "    return pd.Series({'ICD9_CODE' : icd9_codes})\n",
    "\n",
    "# Replace the path with DIAGNOSES_ICD.csv file on your machine.\n",
    "df_icd9_codes = pd.read_csv('../../../bigdata/MIMIC/DIAGNOSES_ICD.csv').dropna()\n",
    "df_icd9_codes = df_icd9_codes.groupby(['SUBJECT_ID', 'HADM_ID']).apply(combine_icd9_codes)\n",
    "df_icd9_codes = pd.DataFrame(df_icd9_codes).reset_index()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Clean Discharge Summaries\n",
    "========================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace the path with NOTEEVENTS.csv file on your machine\n",
    "df_notes = pd.read_csv('../../../bigdata/MIMIC/NOTEEVENTS.csv')\n",
    "\n",
    "columns_to_keep = ['SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'DESCRIPTION', 'TEXT']\n",
    "df_notes_discharge = df_notes[(df_notes.CATEGORY == 'Discharge summary') & (pd.isnull(df_notes.ISERROR))][columns_to_keep]\n",
    "df_notes_discharge['DESCRIPTION'] = df_notes_discharge['DESCRIPTION'].replace({'Report' : 0, 'Addendum' : 1})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def group_text_reports(groupframe) :\n",
    "    #Combine main report and addenda\n",
    "    groupframe = groupframe.sort_values(by=['DESCRIPTION', 'CHARTDATE'])\n",
    "    concat_text = \" \".join(groupframe['TEXT']).strip()\n",
    "    return pd.Series({'TEXT' : concat_text})\n",
    "\n",
    "df_notes_discharge_combined = df_notes_discharge.groupby(['SUBJECT_ID', 'HADM_ID']).apply(group_text_reports)\n",
    "df_notes_discharge_combined = pd.DataFrame(df_notes_discharge_combined).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from Transparency.preprocess.vectorizer import cleaner_mimic\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "texts = list(df_notes_discharge_combined['TEXT'])\n",
    "\n",
    "from multiprocessing import Pool\n",
    "with Pool(5) as p :\n",
    "    cleaned_texts = list(tqdm(p.imap(cleaner_mimic, texts), total=len(texts)))\n",
    "\n",
    "df_notes_discharge_combined['TEXT'] = cleaned_texts\n",
    "df_notes_discharge_combined = df_notes_discharge_combined.merge(df_icd9_codes, on=['SUBJECT_ID', 'HADM_ID'])\n",
    "df_notes_discharge_combined.to_csv('cleaned_discharge_summaries.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Train Word2Vec on discharge summaries\n",
    "====================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from gensim.models import Word2Vec\n",
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
    "\n",
    "sentences = [x.split(' ') for x in cleaned_texts]\n",
    "model = Word2Vec(sentences, size=300, window=10, min_count=2, workers=10)\n",
    "model.wv.save(\"mimic_embedding_model.wv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
